# IPython shell escape: run nbconvert to export this notebook as an HTML file.
!jupyter nbconvert gapminder_clean-v4.ipynb --to html
[NbConvertApp] Converting notebook gapminder_clean-v4.ipynb to html [NbConvertApp] Writing 4466876 bytes to gapminder_clean-v4.html
import pandas as pd
import plotly.express as px
from scipy.stats import pearsonr
from scipy import stats
import seaborn as sns
from matplotlib import pyplot as plt
# Load the cleaned Gapminder table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='gapminder_clean.csv')
df.head(n=5)
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Afghanistan | 1962 | NaN | 0.073781 | 21.276422 | NaN | NaN | 4.878051 | 7.450 | NaN | 9.349593 | NaN | NaN | 33.219902 | 14.312061 | NaN | 10267083.0 | Asia | 853.100710 |
| 1 | 1 | Afghanistan | 1967 | NaN | 0.123782 | 9.917662 | NaN | NaN | 6.772908 | 7.450 | NaN | 14.209827 | NaN | NaN | 35.389415 | 15.881812 | NaN | 11537966.0 | Asia | 836.197138 |
| 2 | 2 | Afghanistan | 1972 | NaN | 0.130820 | 18.880833 | NaN | NaN | 14.763231 | 7.450 | NaN | 18.105850 | NaN | NaN | 37.610146 | 17.947027 | NaN | 13079460.0 | Asia | 739.981106 |
| 3 | 3 | Afghanistan | 1977 | NaN | 0.183118 | 13.836822 | NaN | NaN | 11.662904 | 7.449 | NaN | 14.823175 | NaN | NaN | 40.110146 | 19.998926 | NaN | 14880372.0 | Asia | 786.113360 |
| 4 | 4 | Afghanistan | 1982 | NaN | 0.165879 | NaN | NaN | NaN | NaN | 7.450 | NaN | NaN | NaN | NaN | 43.230732 | 19.402324 | NaN | 12881816.0 | Asia | 978.011439 |
# Keep only the 1962 observations, then confirm that a single year remains.
df = df.loc[df['Year'].eq(1962)]
df['Year'].value_counts()
1962 259 Name: Year, dtype: int64
# Scatter of CO2 emissions per capita (x) against GDP per capita (y) for df.
sns.scatterplot(
    x='CO2 emissions (metric tons per capita)',
    y="gdpPercap",
    data=df,
)
<AxesSubplot:xlabel='CO2 emissions (metric tons per capita)', ylabel='gdpPercap'>
# Drop rows missing either variable, then compute the Pearson correlation
# (the call returns the coefficient together with its p-value).
new = df.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'])
pearsonr(new['CO2 emissions (metric tons per capita)'], new["gdpPercap"])
(0.9260816725019472, 1.1286792210038754e-46)
# Reload the full table (df above was narrowed to a single year).
df_1 = pd.read_csv('gapminder_clean.csv')
# value_counts() orders its index by frequency, not by year; sort so the
# distinct survey years are listed chronologically.
years_list = sorted(df_1['Year'].value_counts().index)
print(years_list)
[2007, 2002, 1992, 1997, 1987, 1982, 1977, 1967, 1972, 1962]
# Pearson correlation between CO2 emissions per capita and GDP per capita,
# computed separately for each year in years_list.
# my_dict maps year -> result returned by pearsonr (coefficient, p-value).
my_dict = {}
for i in years_list:
    # Rows belonging to the current year only.
    df_2 = df_1[df_1['Year'] == i]
    # Keep only rows where both variables are present before correlating.
    new = df_2.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'])
    x = pearsonr(new['CO2 emissions (metric tons per capita)'], new["gdpPercap"])
    my_dict[i] = x
# Printed once, after every year has been processed.
print(my_dict)
{2007: (0.7204168835195922, 9.232746580444968e-22), 2002: (0.8006420656289517, 3.86356386260146e-29), 1992: (0.8094316203754854, 1.6106137786338828e-29), 1997: (0.8081396359866696, 7.976155566418277e-30), 1987: (0.8095530841191891, 3.899627362464272e-28), 1982: (0.8166384024309292, 5.56591579806399e-29), 1977: (0.7928335935904501, 2.838891967092376e-26), 1967: (0.9387918385063311, 3.397143356640955e-53), 1972: (0.8428985866268464, 1.8242919425112505e-32), 1962: (0.9260816725019472, 1.1286792210038754e-46)}
# Tabulate the per-year results: one column per year, one row per element of
# each pearsonr result.
pd.DataFrame(data=my_dict)
| 2007 | 2002 | 1992 | 1997 | 1987 | 1982 | 1977 | 1967 | 1972 | 1962 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.204169e-01 | 8.006421e-01 | 8.094316e-01 | 8.081396e-01 | 8.095531e-01 | 8.166384e-01 | 7.928336e-01 | 9.387918e-01 | 8.428986e-01 | 9.260817e-01 |
| 1 | 9.232747e-22 | 3.863564e-29 | 1.610614e-29 | 7.976156e-30 | 3.899627e-28 | 5.565916e-29 | 2.838892e-26 | 3.397143e-53 | 1.824292e-32 | 1.128679e-46 |
# All years: interactive scatter of CO2 emissions vs. GDP per capita, with
# points colored by continent and sized by population. Rows missing either
# plotted variable are removed first.
new = df_1.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'])
fig = px.scatter(
    new,
    x='CO2 emissions (metric tons per capita)',
    y="gdpPercap",
    size='pop',
    color="continent",
)
fig.show()
# Year 1967 only: same scatter as for all years, restricted to a single year.
df_1967 = df_1.loc[df_1['Year'] == 1967]
new = df_1967.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'])
fig = px.scatter(
    new,
    x='CO2 emissions (metric tons per capita)',
    y="gdpPercap",
    size='pop',
    color="continent",
)
fig.show()
On average, Africa has the lowest energy use per capita, while Oceania has the highest.
# Bar chart of energy use per capita grouped by continent, over all rows of df_1.
sns.barplot(
    data=df_1,
    x="continent",
    y="Energy use (kg of oil equivalent per capita)",
);
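There is no significant difference in imports of goods and services (% of GDP) between Europe and Asia in the years after 1990 (two-sample t-test, p ≈ 0.158, above the 0.05 threshold).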
# Number of rows available for each survey year (missing years are not counted).
df_1['Year'].value_counts(dropna=True)
2007 263 2002 263 1992 263 1997 262 1987 260 1982 260 1977 259 1967 259 1972 259 1962 259 Name: Year, dtype: int64
# Restrict to the four survey years after 1990 and check the row count per year.
years_after_1990 = [2007, 2002, 1992, 1997]
df_after1990 = df_1.loc[df_1['Year'].isin(years_after_1990)]
df_after1990['Year'].value_counts()
2007 263 2002 263 1992 263 1997 262 Name: Year, dtype: int64
# Keep rows that have both a continent and an imports value, then chart
# imports (% of GDP) by continent.
df_3 = df_after1990.dropna(
    subset=['continent', 'Imports of goods and services (% of GDP)']
)
sns.barplot(
    data=df_3,
    x="continent",
    y="Imports of goods and services (% of GDP)",
);
# Two-sample t-test comparing imports (% of GDP) between Europe (A) and
# Asia (B), using the rows of df_3.
A = df_3[df_3['continent'] == 'Europe']['Imports of goods and services (% of GDP)'].tolist()
B = df_3[df_3['continent'] == 'Asia']['Imports of goods and services (% of GDP)'].tolist()
# NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance test);
# confirm that this assumption is appropriate for these two groups.
t_check = stats.ttest_ind(A, B)
print(t_check)
alpha = 0.05  # significance level used for the decision below
if t_check.pvalue < alpha:
    print('there is a significant difference')
else:
    print('no significant difference')
Ttest_indResult(statistic=-1.4185256887958828, pvalue=0.1575196932555432) no significant difference
Macao SAR, China and Monaco have the highest population densities.
# Five rows with the highest population density, in descending order.
df_1.sort_values(
    by='Population density (people per sq. km of land area)',
    ascending=False,
).head(n=5)
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1447 | 1447 | Macao SAR, China | 1997 | 0.0 | 3.640025 | 76.995004 | NaN | NaN | 83.695990 | 1.119 | -0.300000 | 53.839286 | 14.270269 | 1.110101 | 76.962756 | 20601.550000 | 85.729731 | NaN | NaN | NaN |
| 1446 | 1446 | Macao SAR, China | 1992 | 0.0 | 2.873041 | 62.279489 | NaN | NaN | 91.334843 | 1.487 | 13.300000 | 69.714714 | 19.146195 | 16.229600 | 75.779610 | 18889.950000 | 80.853805 | NaN | NaN | NaN |
| 1625 | 1625 | Monaco | 2007 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14.582442 | NaN | NaN | 2.477889 | NaN | 17523.000000 | NaN | NaN | NaN | NaN |
| 1449 | 1449 | Macao SAR, China | 2007 | 0.0 | 3.131315 | -4.615048 | NaN | NaN | 79.310120 | 0.909 | 14.447550 | 52.068703 | 19.137789 | 8.821400 | 79.056244 | 16884.315069 | 80.862211 | NaN | NaN | NaN |
| 1448 | 1448 | Macao SAR, China | 2002 | 0.0 | 3.434371 | 20.850022 | NaN | NaN | 93.888597 | 0.852 | 8.947245 | 59.514399 | 12.111175 | -1.321622 | 78.041976 | 16451.037037 | 87.888825 | NaN | NaN | NaN |
# Population density of every row, grouped by year.
# catplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produces an extra empty figure.
# Size the plot through catplot's own parameters instead:
# width = height * aspect, i.e. 10 x 20 inches here.
sns.catplot(
    x="Year",
    y="Population density (people per sq. km of land area)",
    data=df_1,
    height=20,
    aspect=0.5,
);
<Figure size 720x1440 with 0 Axes>
# Rows whose population density exceeds 10,000 people per sq. km, plotted
# twice: grouped by country (colored by year) and grouped by year (colored
# by country).
density_col = 'Population density (people per sq. km of land area)'
df_pop_dens = df_1.loc[df_1[density_col] > 10000]
sns.catplot(data=df_pop_dens, x="Country Name", y=density_col, hue='Year');
sns.catplot(data=df_pop_dens, x="Year", y=density_col, hue='Country Name');
Maldives had the largest increase in life expectancy between 1962 and 2007 (about 36.9 years).
# Life expectancy for the first and last survey years only, with rows that
# have any missing value removed.
life_col = 'Life expectancy at birth, total (years)'
df_life = df_1.loc[
    df_1['Year'].isin([1962, 2007]),
    ['Country Name', 'Year', life_col],
].dropna()
# 1962 slice, indexed by country so it can later be aligned with 2007.
df_1962 = df_life.loc[df_life['Year'] == 1962].set_index('Country Name')
df_1962.head()
| Year | Life expectancy at birth, total (years) | |
|---|---|---|
| Country Name | ||
| Afghanistan | 1962 | 33.219902 |
| Albania | 1962 | 64.162854 |
| Algeria | 1962 | 47.045000 |
| Angola | 1962 | 33.787585 |
| Antigua and Barbuda | 1962 | 62.598537 |
# 2007 slice of the life-expectancy table, indexed by country name.
df_2007 = df_life.loc[df_life['Year'] == 2007].set_index('Country Name')
df_2007.head()
| Year | Life expectancy at birth, total (years) | |
|---|---|---|
| Country Name | ||
| Afghanistan | 2007 | 57.833829 |
| Albania | 2007 | 76.470293 |
| Algeria | 2007 | 72.898366 |
| Angola | 2007 | 49.435732 |
| Antigua and Barbuda | 2007 | 74.803220 |
# Align the 1962 and 2007 tables on country name (countries present in both),
# discard the two year columns, and compute the change in life expectancy.
mergedDf = pd.merge(df_1962, df_2007, left_index=True, right_index=True)
mergedDf = mergedDf.drop(columns=['Year_x', 'Year_y'])
mergedDf.columns = ['Life expectancy 1962', 'Life expectancy 2007']
mergedDf['Increase'] = (
    mergedDf['Life expectancy 2007'] - mergedDf['Life expectancy 1962']
)
mergedDf.head()
| Life expectancy 1962 | Life expectancy 2007 | Increase | |
|---|---|---|---|
| Country Name | |||
| Afghanistan | 33.219902 | 57.833829 | 24.613927 |
| Albania | 64.162854 | 76.470293 | 12.307439 |
| Algeria | 47.045000 | 72.898366 | 25.853366 |
| Angola | 33.787585 | 49.435732 | 15.648146 |
| Antigua and Barbuda | 62.598537 | 74.803220 | 12.204683 |
# Countries ordered from the largest to the smallest change in life expectancy.
mergedDf.sort_values(by='Increase', ascending=False)
| Life expectancy 1962 | Life expectancy 2007 | Increase | |
|---|---|---|---|
| Country Name | |||
| Maldives | 38.483561 | 75.399707 | 36.916146 |
| Bhutan | 33.094146 | 66.293098 | 33.198951 |
| Timor-Leste | 34.739049 | 65.824195 | 31.085146 |
| Tunisia | 43.341683 | 74.202439 | 30.860756 |
| Oman | 44.300512 | 75.123610 | 30.823098 |
| ... | ... | ... | ... |
| Belarus | 68.635829 | 70.203415 | 1.567585 |
| Russian Federation | 67.021415 | 67.497561 | 0.476146 |
| Ukraine | 69.146098 | 68.222195 | -0.923902 |
| Lesotho | 47.402244 | 44.882220 | -2.520024 |
| Zimbabwe | 52.277902 | 44.177756 | -8.100146 |
236 rows × 3 columns